import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from warnings import filterwarnings
filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Load the car listings CSV into a DataFrame and display it.
# The path is an absolute location on the author's machine.
csv_path = "C:\\Users\\laxma\\Downloads\\car data.csv"
data = pd.read_csv(csv_path)
data
| | Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ritz | 2014 | 3.35 | 5.59 | 27000 | Petrol | Dealer | Manual | 0 |
| 1 | sx4 | 2013 | 4.75 | 9.54 | 43000 | Diesel | Dealer | Manual | 0 |
| 2 | ciaz | 2017 | 7.25 | 9.85 | 6900 | Petrol | Dealer | Manual | 0 |
| 3 | wagon r | 2011 | 2.85 | 4.15 | 5200 | Petrol | Dealer | Manual | 0 |
| 4 | swift | 2014 | 4.60 | 6.87 | 42450 | Diesel | Dealer | Manual | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 296 | city | 2016 | 9.50 | 11.60 | 33988 | Diesel | Dealer | Manual | 0 |
| 297 | brio | 2015 | 4.00 | 5.90 | 60000 | Petrol | Dealer | Manual | 0 |
| 298 | city | 2009 | 3.35 | 11.00 | 87934 | Petrol | Dealer | Manual | 0 |
| 299 | city | 2017 | 11.50 | 12.50 | 9000 | Diesel | Dealer | Manual | 0 |
| 300 | brio | 2016 | 5.30 | 5.90 | 5464 | Petrol | Dealer | Manual | 0 |
301 rows × 9 columns
# Preview the first rows of the dataset.
data.head()
| | Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ritz | 2014 | 3.35 | 5.59 | 27000 | Petrol | Dealer | Manual | 0 |
| 1 | sx4 | 2013 | 4.75 | 9.54 | 43000 | Diesel | Dealer | Manual | 0 |
| 2 | ciaz | 2017 | 7.25 | 9.85 | 6900 | Petrol | Dealer | Manual | 0 |
| 3 | wagon r | 2011 | 2.85 | 4.15 | 5200 | Petrol | Dealer | Manual | 0 |
| 4 | swift | 2014 | 4.60 | 6.87 | 42450 | Diesel | Dealer | Manual | 0 |
# Preview the last rows of the dataset.
data.tail()
| | Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner |
|---|---|---|---|---|---|---|---|---|---|
| 296 | city | 2016 | 9.50 | 11.6 | 33988 | Diesel | Dealer | Manual | 0 |
| 297 | brio | 2015 | 4.00 | 5.9 | 60000 | Petrol | Dealer | Manual | 0 |
| 298 | city | 2009 | 3.35 | 11.0 | 87934 | Petrol | Dealer | Manual | 0 |
| 299 | city | 2017 | 11.50 | 12.5 | 9000 | Diesel | Dealer | Manual | 0 |
| 300 | brio | 2016 | 5.30 | 5.9 | 5464 | Petrol | Dealer | Manual | 0 |
# Print the column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 301 entries, 0 to 300 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Car_Name 301 non-null object 1 Year 301 non-null int64 2 Selling_Price 301 non-null float64 3 Present_Price 301 non-null float64 4 Kms_Driven 301 non-null int64 5 Fuel_Type 301 non-null object 6 Seller_Type 301 non-null object 7 Transmission 301 non-null object 8 Owner 301 non-null int64 dtypes: float64(2), int64(3), object(4) memory usage: 21.3+ KB
# Count missing values in each column.
data.isnull().sum()
Car_Name 0 Year 0 Selling_Price 0 Present_Price 0 Kms_Driven 0 Fuel_Type 0 Seller_Type 0 Transmission 0 Owner 0 dtype: int64
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()
2
# Remove the duplicated rows in place, then re-count to confirm none remain.
data.drop_duplicates(inplace=True)
data.duplicated().sum()
0
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()
| | Year | Selling_Price | Present_Price | Kms_Driven | Owner |
|---|---|---|---|---|---|
| count | 299.000000 | 299.000000 | 299.000000 | 299.000000 | 299.000000 |
| mean | 2013.615385 | 4.589632 | 7.541037 | 36916.752508 | 0.043478 |
| std | 2.896868 | 4.984240 | 8.567887 | 39015.170352 | 0.248720 |
| min | 2003.000000 | 0.100000 | 0.320000 | 500.000000 | 0.000000 |
| 25% | 2012.000000 | 0.850000 | 1.200000 | 15000.000000 | 0.000000 |
| 50% | 2014.000000 | 3.510000 | 6.100000 | 32000.000000 | 0.000000 |
| 75% | 2016.000000 | 6.000000 | 9.840000 | 48883.500000 | 0.000000 |
| max | 2018.000000 | 35.000000 | 92.600000 | 500000.000000 | 3.000000 |
# VISUALIZATION
# Bar chart of present price against seller type, with vertical tick labels.
seller_types = data['Seller_Type']
present_prices = data['Present_Price']
plt.bar(seller_types, present_prices)
plt.xticks(rotation=90)
plt.show()
# Present price per car model, one colour per model.
fig = px.bar(data, x='Car_Name', y='Present_Price', color='Car_Name')
fig.show()

# Distribution of present price within each fuel type.
# A violin plot needs a numeric column on its value axis; the previous version
# put the categorical Fuel_Type there, so no distribution could be drawn.
# NOTE(review): the choice of Present_Price per Fuel_Type is an assumption
# about the intended chart - confirm with the author.
fig = px.violin(data, x='Fuel_Type', y='Present_Price', color='Fuel_Type')
fig.show()

# Owner values stacked per seller type, coloured by the owner value.
fig = px.bar(data, x='Seller_Type', y='Owner', color='Owner')
fig.show()
# Compare categorical columns pairwise using counts.
# The previous version passed a string column (Transmission) as bar heights and
# overlaid a Seller_Type/Fuel_Type scatter on the same axes, which mixes
# unrelated categories on both axes and shows no quantity at all.
# NOTE(review): grouped count plots are an assumption about the intended
# comparison - confirm with the author.
fig, (fuel_ax, seller_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.countplot(x='Fuel_Type', hue='Transmission', data=data, ax=fuel_ax)
sns.countplot(x='Seller_Type', hue='Fuel_Type', data=data, ax=seller_ax)
for axis in (fuel_ax, seller_ax):
    axis.tick_params(axis='x', rotation=90)
plt.show()
# Number of listings per fuel type.
plt.figure(figsize=(10, 4))
fuel_ax = sns.countplot(data=data, x='Fuel_Type', color='b')
fuel_ax.set_title('Car ICE')
plt.show()
# Horizontal count plot of transmission types, ordered from most to least common
# (limited to the ten most frequent values).
plt.figure(figsize=(10, 4))
transmission_counts = data['Transmission'].value_counts()
top_car = transmission_counts.nlargest(10)
sns.countplot(data=data, y='Transmission', order=top_car.index, color='red')
<AxesSubplot:xlabel='count', ylabel='Transmission'>
# Line plot of present price against model year.
price_by_year_ax = sns.lineplot(data=data, x='Year', y='Present_Price')
price_by_year_ax.set_title('Variation of present price with year')
Text(0.5, 1.0, 'Variation of present price with year')
# Present price per seller type as horizontal bars (seaborn's default bar
# height is the mean of the values in each category).
# x and y are passed by keyword: since seaborn 0.12 only `data` may be given
# positionally, so the previous positional call raises TypeError there.
sns.barplot(x='Present_Price', y='Seller_Type', data=data, color='r')
plt.xticks(rotation=90)
plt.show()
# Scatter plot of kilometres driven against model year.
plt.figure(figsize=(8, 4))
year_kms_ax = sns.scatterplot(x='Year', y='Kms_Driven', data=data)
year_kms_ax.set_title('YEAR vs. KMS_DRIVEN')
year_kms_ax.set_xlabel('Year')
year_kms_ax.set_ylabel('Kms_Driven')
plt.show()
# Distribution plot of the Kms_Driven column.
sns.displot(data["Kms_Driven"])
<seaborn.axisgrid.FacetGrid at 0x1c1c5da4280>
# Relational plot of present price against model year.
sns.relplot(x='Year',y='Present_Price',data=data)
<seaborn.axisgrid.FacetGrid at 0x1c1c5f2cac0>
# Count of listings per fuel type, with the x tick labels rotated vertically.
sns.countplot(x='Fuel_Type',data=data)
plt.xticks(rotation=90)
(array([0, 1, 2]), [Text(0, 0, 'Petrol'), Text(1, 0, 'Diesel'), Text(2, 0, 'CNG')])
# Horizontal box plots of selling price, one box per fuel type.
sns.boxplot(x='Selling_Price',y='Fuel_Type',data=data)
<AxesSubplot:xlabel='Selling_Price', ylabel='Fuel_Type'>
# Violin plots of kilometres driven, one per Owner value.
sns.violinplot(x='Owner',y='Kms_Driven',data=data)
<AxesSubplot:xlabel='Owner', ylabel='Kms_Driven'>
# MODEL BUILDING
# Replace the model year with the car's age, measured against a fixed
# reference year of 2024, then preview the result.
reference_year = 2024
data['Age'] = reference_year - data['Year']
data.drop(columns='Year', inplace=True)
data.head()
| | Car_Name | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner | Age |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ritz | 3.35 | 5.59 | 27000 | Petrol | Dealer | Manual | 0 | 10 |
| 1 | sx4 | 4.75 | 9.54 | 43000 | Diesel | Dealer | Manual | 0 | 11 |
| 2 | ciaz | 7.25 | 9.85 | 6900 | Petrol | Dealer | Manual | 0 | 7 |
| 3 | wagon r | 2.85 | 4.15 | 5200 | Petrol | Dealer | Manual | 0 | 13 |
| 4 | swift | 4.60 | 6.87 | 42450 | Diesel | Dealer | Manual | 0 | 10 |
# Rename the price columns to carry a "(lacs)" suffix and Owner to past_owner,
# then show the resulting column index.
column_renames = {
    'Selling_Price': 'Selling_Price(lacs)',
    'Present_Price': 'Present_Price(lacs)',
    'Owner': 'past_owner',
}
data.rename(columns=column_renames, inplace=True)
data.columns
Index(['Car_Name', 'Selling_Price(lacs)', 'Present_Price(lacs)', 'Kms_Driven',
'Fuel_Type', 'Seller_Type', 'Transmission', 'past_owner', 'Age'],
dtype='object')
def _plot_in_pairs(columns, plot_func, figsize):
    """Draw one figure per two columns, with a side-by-side panel for each.

    columns   -- names of columns in ``data`` to plot, in order
    plot_func -- plotting callable invoked as ``plot_func(x=column, data=data)``
    figsize   -- figure size handed to ``plt.figure``

    A trailing unpaired column (odd-length list) gets a figure with only the
    left panel instead of raising IndexError.
    """
    for start in range(0, len(columns), 2):
        plt.figure(figsize=figsize)
        # Panels are numbered from 1 in matplotlib's subplot grid.
        for panel, column in enumerate(columns[start:start + 2], start=1):
            plt.subplot(1, 2, panel)
            plot_func(x=column, data=data)
        plt.show()


# Count plots for the categorical columns, two per figure.
cat_cols = ['Fuel_Type', 'Seller_Type', 'Transmission', 'past_owner']
_plot_in_pairs(cat_cols, sns.countplot, figsize=[10, 4])

# Box plots for the numeric columns, two per figure.
num_cols = ['Selling_Price(lacs)', 'Present_Price(lacs)', 'Kms_Driven', 'Age']
_plot_in_pairs(num_cols, sns.boxplot, figsize=[13, 3])
# Drop the car name, one-hot encode the remaining categorical columns
# (omitting the first level of each), and inspect the correlations.
data.drop(columns='Car_Name', inplace=True)
data = pd.get_dummies(data, drop_first=True)

# Compute the correlation matrix once; use it for the heatmap and for the
# column of correlations with the selling price.
correlations = data.corr()
sns.heatmap(correlations, annot=True, cmap="RdBu")
plt.show()
correlations['Selling_Price(lacs)']
Selling_Price(lacs) 1.000000 Present_Price(lacs) 0.876378 Kms_Driven 0.028566 past_owner -0.087880 Age -0.234369 Fuel_Type_Diesel 0.543541 Fuel_Type_Petrol -0.531636 Seller_Type_Individual -0.553851 Transmission_Manual -0.348869 Name: Selling_Price(lacs), dtype: float64
# Separate the target (selling price) from the feature columns.
target_column = 'Selling_Price(lacs)'
y = data[target_column]
x = data.drop(columns=target_column)
x.head()
| | Present_Price(lacs) | Kms_Driven | past_owner | Age | Fuel_Type_Diesel | Fuel_Type_Petrol | Seller_Type_Individual | Transmission_Manual |
|---|---|---|---|---|---|---|---|---|
| 0 | 5.59 | 27000 | 0 | 10 | 0 | 1 | 0 | 1 |
| 1 | 9.54 | 43000 | 0 | 11 | 1 | 0 | 0 | 1 |
| 2 | 9.85 | 6900 | 0 | 7 | 0 | 1 | 0 | 1 |
| 3 | 4.15 | 5200 | 0 | 13 | 0 | 1 | 0 | 1 |
| 4 | 6.87 | 42450 | 0 | 10 | 1 | 0 | 0 | 1 |
# Preview the first values of the target series.
y.head()
0 3.35 1 4.75 2 7.25 3 2.85 4 4.60 Name: Selling_Price(lacs), dtype: float64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. Then report the shape of each part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
for label, part in (
    ("x train: ", x_train),
    ("x test: ", x_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)
x train: (239, 8) x test: (60, 8) y train: (239,) y test: (60,)
# Fit an ordinary least-squares model on the training split, predict the test
# split, and print the learned parameters.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)
print("coefficients:", model.coef_)
print("intercept:", model.intercept_)
coefficients: [ 5.29181181e-01 -5.99046152e-06 -9.44252885e-01 -4.18029869e-01 2.12111606e+00 4.98109745e-01 -4.73063346e-01 -5.27393308e-01] intercept: 5.087110830354567
from sklearn.metrics import mean_squared_error
# Mean squared error of the predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("mean squared error:", mse)
mean squared error: 5.132840514762981
# Residuals: actual minus predicted selling price for each test row.
# The previous line had a stray comma (`y_test,-y_pred`), which built the
# tuple (y_test, -y_pred) instead of subtracting.
error = y_test - y_pred
print(error)
(175 0.38
289 10.11
52 18.00
148 0.52
216 2.90
199 0.12
276 8.65
63 23.50
28 1.95
203 2.95
114 1.15
121 1.05
11 6.85
246 3.75
112 1.15
297 4.00
74 4.90
107 1.25
267 8.35
109 1.20
190 0.20
12 7.50
71 4.50
19 2.65
89 4.75
110 1.20
300 5.30
178 0.35
122 1.05
252 5.40
86 35.00
298 3.35
134 0.65
287 5.75
133 0.72
79 14.50
255 3.00
140 0.60
91 11.25
191 0.20
4 4.60
195 0.18
156 0.48
249 5.25
81 4.75
16 7.25
170 0.40
129 0.78
59 19.99
187 0.25
281 2.10
213 2.90
176 0.35
60 6.95
152 0.50
214 5.25
193 0.20
285 7.40
174 0.38
141 0.60
Name: Selling_Price(lacs), dtype: float64, array([ 8.82443400e-01, -8.84467711e+00, -1.46540728e+01, 1.03979470e+00,
-3.98298018e+00, 2.53231323e+00, -8.34385906e+00, -2.21937617e+01,
-1.02884763e+00, -1.73742233e+00, -1.49855359e+00, 9.71867882e-02,
-8.10595189e+00, -3.42494291e+00, -1.63256684e+00, -4.05829972e+00,
-6.72891454e+00, -6.90350211e-01, -8.19447911e+00, -2.37055610e+00,
2.06625579e+00, -6.99315577e+00, -7.58105566e+00, -3.18001822e+00,
-4.21517490e+00, -2.14652470e+00, -4.80302540e+00, -1.09321399e+00,
-1.82372418e+00, -5.24370518e+00, -4.93633368e+01, -4.08160698e+00,
-2.01650415e+00, -5.52823031e+00, -1.74025186e+00, -1.78569533e+01,
-2.55105014e+00, -4.29758011e-01, -1.06661636e+01, 1.22397566e+00,
-5.88171452e+00, -7.82166925e-01, -1.84387213e+00, -5.63352745e+00,
-6.83301858e+00, -8.37875925e+00, 2.53216644e-02, -2.07311472e+00,
-2.18116746e+01, -6.46229964e-02, -1.25281187e+00, -3.02725664e+00,
4.73697981e-01, -1.00679360e+01, -1.44188632e-02, -5.02745877e+00,
7.57620831e-01, -6.64881130e+00, -9.72273736e-01, -1.54406068e+00]))
# Residual plot: prediction error against predicted value, with a fitted
# regression line.
# NOTE(review): this rebinds `data`, replacing the feature table with the
# two-column residual frame; earlier cells cannot be re-run after this one.
error = y_test - y_pred
residual_columns = {'y_pred': y_pred, 'error': error}
data = pd.DataFrame(residual_columns)
sns.regplot(data=data, x='y_pred', y='error')
plt.show()